{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 生成式对话机器人"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step1 导入相关包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = '/data/datasets/alpaca_data_zh/alpaca_gpt4_data_zh.json'\n",
    "pretrain_model_dir = \"/data/models/huggingface/bloom-389m-zh\"\n",
    "save_dir = '/data/logs/conversation_rebot'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\Miniconda\\envs\\geo\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n",
      "d:\\Miniconda\\envs\\geo\\lib\\site-packages\\numpy\\.libs\\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll\n",
      "d:\\Miniconda\\envs\\geo\\lib\\site-packages\\numpy\\.libs\\libopenblas64__v0.3.21-gcc_10_3_0.dll\n",
      "  warnings.warn(\"loaded more than 1 DLL from .libs:\"\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset,load_dataset\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step2 加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['instruction', 'output', 'input'],\n",
      "    num_rows: 48817\n",
      "})\n",
      "1\n"
     ]
    }
   ],
   "source": [
    "# ds = Dataset.load_from_disk(\"./alpaca_data_zh/\")\n",
    "# 加载数据集\n",
    "datasets = load_dataset('json', data_files=data_dir,split='train')\n",
    "datasets = datasets.train_test_split(test_size=0.000001)\n",
    "ds = datasets[\"train\"]\n",
    "print(ds)\n",
    "print(len(datasets[\"test\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'instruction': ['用给定的正则表达式创建一个示例字符串。', '说一部有蝎子的电影', '命名一个为孩子们树立榜样的著名运动员。'],\n",
       " 'output': ['输出：1234-5678-9012-3456\\n\\n说明：此正则表达式与格式为 \"四个数字后跟一条破折号或空格，重复四次\" 的字符串匹配。 \"1234-5678-9012-3456\" 是一个匹配此模式的例子字符串。',\n",
       "  '《蝎子王》（The Scorpion King）是一部由查克·拉塞尔执导的美国片，于2002年上映。影片讲述了一个关于英勇勇士马阿蒂斯（The Rock饰）率领沙漠游牧民族反抗残暴暴君的故事。这里的“蝎子王”指的就是主角马阿蒂斯，他是一位像蝎子一样狡猾、冷酷的勇士。',\n",
       "  '科比·布莱恩特（Kobe Bryant）是一个杰出的运动员，他在职业篮球领域取得了巨大的成功，对孩子们树立了一个榜样。他不仅作为洛杉矶湖人队的核心球员获得了5次NBA总冠军，还是一位18次全明星球员和2届奥运会金牌得主。科比·布莱恩特以其职业精神、不懈的努力和坚定的决心闻名于世，并一直致力于通过篮球培养下一代。他创立了科比和凡妮莎布莱恩特家庭基金会，旨在帮助年轻人获得机会，实现他们的梦想。他的精神对于孩子们来说是一个很好的榜样。'],\n",
       " 'input': ['输入：\\\\d{4}[- ]\\\\d{4}[- ]\\\\d{4}[- ]\\\\d{4}', '', '']}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step3 数据集预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(pretrain_model_dir)\n",
    "tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "def process_func(example):\n",
    "    MAX_LENGTH = 1800    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性\n",
    "    input_ids, attention_mask, labels = [], [], []\n",
    "    instruction = tokenizer(f\"<|im_start|>system\\n你是一个逻辑推理专家，擅长解决逻辑推理问题。<|im_end|>\\n<|im_start|>user\\n{example['instruction'] + example['input']}<|im_end|>\\n<|im_start|>assistant\\n\", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens\n",
    "    response = tokenizer(f\"{example['output']}\", add_special_tokens=False)\n",
    "    input_ids = instruction[\"input_ids\"] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n",
    "    attention_mask = instruction[\"attention_mask\"] + response[\"attention_mask\"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1\n",
    "    labels = [-100] * len(instruction[\"input_ids\"]) + response[\"input_ids\"] + [tokenizer.pad_token_id]  \n",
    "    if len(input_ids) > MAX_LENGTH:  # 做一个截断\n",
    "        input_ids = input_ids[:MAX_LENGTH]\n",
    "        attention_mask = attention_mask[:MAX_LENGTH]\n",
    "        labels = labels[:MAX_LENGTH]\n",
    "    return {\n",
    "        \"input_ids\": input_ids,\n",
    "        \"attention_mask\": attention_mask,\n",
    "        \"labels\": labels\n",
    "    }\n",
    "\"\"\"\n",
    "def process_func(example):\n",
    "    MAX_LENGTH = 2048\n",
    "    input_ids, attention_mask, labels = [], [], []\n",
    "    instruction = tokenizer(\"\\n\".join([\"Human: \" + example[\"instruction\"], example[\"input\"]]).strip() + \"\\n\\nAssistant: \")\n",
    "    response = tokenizer(example[\"output\"] + tokenizer.eos_token)\n",
    "    input_ids = instruction[\"input_ids\"] + response[\"input_ids\"]\n",
    "    attention_mask = instruction[\"attention_mask\"] + response[\"attention_mask\"]\n",
    "    labels = [-100] * len(instruction[\"input_ids\"]) + response[\"input_ids\"]\n",
    "    if len(input_ids) > MAX_LENGTH:\n",
    "        input_ids = input_ids[:MAX_LENGTH]\n",
    "        attention_mask = attention_mask[:MAX_LENGTH]\n",
    "        labels = labels[:MAX_LENGTH]\n",
    "    return {\n",
    "        \"input_ids\": input_ids,\n",
    "        \"attention_mask\": attention_mask,\n",
    "        \"labels\": labels\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_ds = ds.map(process_func, remove_columns=ds.column_names)\n",
    "tokenized_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.decode(tokenized_ds[1][\"input_ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.decode(list(filter(lambda x: x != -100, tokenized_ds[1][\"labels\"])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step4 创建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoModelForCausalLM.from_pretrained(pretrain_model_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step5 配置训练参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    output_dir=save_dir,\n",
    "    per_device_train_batch_size=1,\n",
    "    per_device_eval_batch_size=1,\n",
    "    gradient_accumulation_steps=4,\n",
    "    logging_steps=10,\n",
    "    num_train_epochs=6\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step6 创建训练器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=tokenized_ds,\n",
    "    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step7 模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step8 模型推理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'generated_text': 'Human: 考试有哪些技巧？\\n\\nAssistant: 考试有很多种不同的技巧，下面是一些常用的技巧：\\n\\n1. 制定一个复习计划：制定一个复习计划，可以帮助你更好地安排时间和保持学习进度。\\n\\n2. 集中注意力：在学习过程中，要集中注意力，避免分心。可以使用番茄工作法或其他时间管理技巧来帮助集中注意力。\\n\\n3. 适当休息：在学习期间适当休息，可以帮助你恢复精力，更好地保持学习进度。\\n\\n4. 多样化学习方法：可以尝试不同的学习方法，如阅读、听力、演示等，以帮助你更好地理解知识。\\n\\n5. 避免分心：学习过程中，要尽量避免分心，例如关闭手机、电脑等，可以帮助你更好地集中注意力。\\n\\n6. 保持积极心态：学习过程中，要保持积极的心态，不要过于担忧考试结果。\\n\\n7. 及时反馈：要及时向老师和同学反馈学习进度，可以帮助你更好地调整学习计划。\\n\\n8. 注意考前准备：在考试前，要做好充分的准备，包括复习资料、模拟考试等，可以帮助你更好地准备考试。\\n\\n以上是一些常用的考试技巧，希望能对你有所帮助。'}]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ipt = \"Human: {}\\n{}\".format(\"考试有哪些技巧？\", \"\").strip() + \"\\n\\nAssistant: \"\n",
    "pipe(ipt, \n",
    "    # max_length=512, \n",
    "    do_sample=True, \n",
    "    max_new_tokens=256,\n",
    "    top_p=0.95,\n",
    "    temperature=0.7,\n",
    "    num_beams=2\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "transformers",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
